1 Taxonomic analysis
2 Alpha-diversity analysis
3 Beta-diversity analysis
4 Microbial transmission prediction
- 4.1 Data processing
- 4.2 Visualization
SessionInfo

1 Taxonomic analysis

1.1 Open taxonomy count matrix table

library(devtools)
library(wilkoxmisc)
library(reshape2)
library(readr)
library(dplyr)
library(tidyr)
rm(list=ls())
# Read the metadata table
count_matrix <- read.table("Kraken_species_count_table.cast.txt", sep = "\t", header = T, row.names = 1)
count_matrix <- as.data.frame(t(count_matrix))
Species <- rep(NA, nrow(count_matrix))
count_matrix <- cbind(Species, count_matrix)
count_matrix[, 1] <- row.names(count_matrix)
rownames(count_matrix) <- NULL
# Calculate the count of each species in each sample and only remove taxa not identified
count_matrix <- count_matrix %>% gather(Sample, Count, 2:(ncol(count_matrix))) %>% filter(Count > 0)
count_matrix$Count <- as.integer(count_matrix$Count)
# Tabulate read counts by genus
count_matrix <- count_matrix %>% group_by(Sample, Species) %>% summarise(count = sum(Count))
## count_matrix <- ddply(count_matrix, .(Sample, Species), summarise, count = sum(Count))
# Convert count to relative abundance and add column (require dplyr)
library(plyr)
count_matrix <- ddply(count_matrix, .(Sample), mutate, RelativeAbundance = (count * 100) / sum(count))
# Save the processed table into txt file
write_tsv(count_matrix,"kraken_species_skin_abundance.tidy.txt")

1.2 Prepare the table for plotting

detach("package:plyr", unload = TRUE)
rm(list=ls())
# Read the previously saved txt file
count_matrix <- read_tsv("kraken_species_skin_abundance.tidy.txt")
# Collapse species table to only 5 or 8 top phyla, genus, family, etc.
count_matrix_table <- collapse_taxon_table(count_matrix, n = 12, Rank = "Species")
# Read the metadata table
Meta <- read.table("metadata_skin.tsv", sep = "\t", header = TRUE)
# Merge relative abundance table and metatable together
count_matrix_table <- merge(count_matrix_table, Meta, by = "Sample", all.x = TRUE)
# Save the table in txt file
write_tsv(count_matrix_table, "Top12Species.txt")

1.3 Visualization

rm(list=ls())
# Read the previously saved txt file
count_matrix_table <- read_tsv("Top12Species.txt")
# Plot the relative abundance
Plot1 <- ggplot(count_matrix_table, aes(x = Sample, y = RelativeAbundance, fill = Species)) +
  geom_bar(stat="identity") +
  scale_fill_brewer(palette = "Paired") +
  theme_classic() +
  facet_wrap(~Individual, scales = "free_x") +
  scale_y_continuous(expand = c(0, 0)) +
  ylab("Relative abundance (%)") +
  xlab("Sample") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 6))
# Save the plot in pdf file
ggsave("top12species_by_individual.pdf", dpi = 3000, width = 12, height = 8)
Plot1

1.4 Identify 3 species that could strongly indicate each individuals

library(indicspecies)
library(kableExtra)
rm(list=ls())
# Read the previously saved txt file
count_matrix_table <- read_tsv("Top12Species.txt")
# Group data by Sample and select the top 3 species with highest RelativeAbundance
top_species <- count_matrix_table %>%
  group_by(Individual) %>%
  top_n(3, wt = RelativeAbundance)
# Summarize the results as a table
indicator_table <- top_species %>%
  select(Individual, Species, RelativeAbundance) %>%
  arrange(Individual, desc(RelativeAbundance)) %>%
  group_by(Individual) %>%
  summarise(Top_3_Species = paste(Species, collapse = ", "),
            Top_3_RelativeAbundance = paste(RelativeAbundance, collapse = ", "))
# Print the summary table
write_tsv(top_species, "Top_species_table.txt")
write_tsv(indicator_table, "Indicator_table.txt")
top_species_table <- kable(top_species, format = "html") %>%
  kable_styling(bootstrap_options = "bordered", full_width = T, font_size = 10, 
                position = "center")
top_species_table

Sample	Species	RelativeAbundance	Type	Sample_type	Sampling.place.specific.details	Location	Sampling_day	Individual
SL336563	Janibacter_indicus	58.71032	Skin	Right palm	residence 3 right palm day 10 day	Residence 3	Day 10 day	Subject_3
SL336568	Janibacter_indicus	57.96876	Skin	Right palm	residence 3 right palm day 9 day	Residence 3	Day 9 day	Subject_3
SL336657	Minor/Unclassified	58.78512	Skin	Right palm	residence 1 right palm day 10 day	Residence 1	Day 10 day	Subject_1
SL336776	Minor/Unclassified	53.75796	Skin	Right palm	residence 3 right palm day 1 night	Residence 3	Day 1 night	Subject_3
SL336832	Xanthomonas_campestris	68.03215	Skin	Left palm	Residence 1 left palm day 2 night	Residence 1	Day 2 night	Subject_1
SL342547	Xanthomonas_campestris	58.71230	Skin	Left palm	residence 1 left palm day 3 day	Residence 1	Day 3 day	Subject_1
SL345916	Minor/Unclassified	62.12133	Skin	Right palm	residence 2 right palm day 6 day	Residence 2	Day 6 day	Subject_2
SL345921	Minor/Unclassified	56.26840	Skin	Right palm	residence 2 right palm day 3 day	Residence 2	Day 3 day	Subject_2
SL345955	Minor/Unclassified	55.98507	Skin	Right palm	residence 2 right palm day 5 night	Residence 2	Day 5 night	Subject_2
SL346089	Minor/Unclassified	77.26152	Skin	Left palm	residence 4 left palm day 7 night	Residence 4	Day 7 night	Subject_4
SL346118	Minor/Unclassified	66.33723	Skin	Right palm	residence 4 right palm day 9 day	Residence 4	Day 9 day	Subject_4
SL346163	Cutibacterium_acnes	68.27916	Skin	Right palm	residence 4 right palm day 8 day	Residence 4	Day 8 day	Subject_4

indicator_table_table <- kable(indicator_table, format = "html") %>%
  kable_styling(bootstrap_options = "bordered", full_width = T, font_size = 10, 
                position = "center")
indicator_table_table

Individual	Top_3_Species	Top_3_RelativeAbundance
Subject_1	Xanthomonas_campestris, Minor/Unclassified, Xanthomonas_campestris	68.0321529197282, 58.7851235672486, 58.7123032111771
Subject_2	Minor/Unclassified, Minor/Unclassified, Minor/Unclassified	62.1213306241151, 56.2683975989247, 55.9850685233147
Subject_3	Janibacter_indicus, Janibacter_indicus, Minor/Unclassified	58.710319700313, 57.9687644372183, 53.7579603501429
Subject_4	Minor/Unclassified, Cutibacterium_acnes, Minor/Unclassified	77.2615164452119, 68.279157624889, 66.3372318379562

2 Alpha-diversity analysis

2.1 Convert cast table into tidy

library(readr)
library(reshape2)
library(dplyr)
library(GUniFrac)
library(vegan)
library(stringr)
library(tidyr)
library(wilkoxmisc)
library(fossil)
library(OTUtable)
rm(list=ls())
# Same as the previous `Taxonomic analysis` section
table <- read.table("Kraken_species_count_table.cast.txt", sep = "\t", header = T, row.names = 1)
table <- as.data.frame(t(table))
Species <- rep(NA, nrow(table))
table <- cbind(Species, table)
table[, 1] <- row.names(table)
rownames(table) <- NULL
# Calculate the count of each species in each sample and only remove taxa not identified
table <- table %>% gather(Sample,Count, 2:(ncol(table))) %>% filter(Count > 0)
table$Count <- as.integer(table$Count)
# Cast the table into dataframe and rename their column names
table <- dcast(table, Sample~Species,value.var = "Count", fill = 0)
colnames(table) <- c("Sample", paste0("Species",1:(length(colnames(table))-1)))
# Save the casted dataframe
write_tsv(table,"kraken_report_all_species.cast.txt")
# Load the casted dataframe
table <- read.csv("kraken_report_all_species.cast.txt", header=TRUE, sep = "\t", row.names = 1, as.is=TRUE)
# rarefy to the minimal depth
table.rarefy <- Rarefy(table,depth = min(rowSums(table)))$otu.tab.rff
table.rarefy <- as.data.frame(as.table(table.rarefy))
names(table.rarefy)[1:2] <- c("Sample","Species")
names(table.rarefy)[3] <- c("Count")
table.rarefy <- table.rarefy %>% filter(!Count== 0)
write_tsv(table.rarefy,"kraken_report_all_species.rarefied.tidy.tsv") # for richness
table.rarefy.cast <- dcast(table.rarefy, Species ~Sample, value.var = "Count", fill = 0)
# Save the rarefied dataframe
write_tsv(table.rarefy.cast,"kraken_report_all_species.rarefied.cast.tsv") # for shannon and eveness
min(rowSums(table)) # depth we used in the rarefaction step

## [1] 82708

2.2 Rarefaction analysis

rm(list=ls())
# Calculate richness for rarefied table and save the result
richness <- read_tsv("kraken_report_all_species.rarefied.tidy.tsv") %>%
    select(Sample) %>%
    mutate(richness = 1) %>%
    group_by(Sample) %>%
    mutate(richness= sum(richness)) %>%
    ungroup() %>%
    unique()
write_tsv(richness,"gut_microbial_richness.txt")
# Calculate shannon diversity for rarefied table and save the result
alpha <- read.table("kraken_report_all_species.rarefied.cast.tsv", header=T, sep="\t", fill =TRUE, quote="", row.names = NULL)
Sample <- colnames(alpha[2:ncol(alpha)])
Species <- alpha[1:nrow(alpha),1]
names(alpha) <- NULL
alpha[,1] <- NULL
# Transpose the data frame
alpha_t <- data.frame(t(alpha))
alpha_t[is.na(alpha_t)] <- 0
colnames(alpha_t) <- Species
# Calculate the shannon diversity for global samples
Shannon <- diversity(alpha_t, "shannon")
# Merge the sample info with alpha diversity
data <- data.frame(cbind(Sample,Shannon))
write_tsv(data, "gut_microbial_shannon.txt")
# Calculate pielou's evenness and save the result
table <- read_tsv("kraken_report_all_species.rarefied.cast.tsv")
table[,1] <- NULL
evenness <- apply(table,2,pielou)
evenness <- as.data.frame(evenness)
evenness$Sample <- rownames(evenness)
write_tsv(evenness,"gut_microbial_pielou_evenness.txt")
# Merge richness, shannon index and evenness as the final alpha diversity result
richness <- read_tsv("gut_microbial_richness.txt")
shannon <- read_tsv("gut_microbial_shannon.txt")
merge <- left_join(richness,shannon)
evenness <- read_tsv("gut_microbial_pielou_evenness.txt")
merge <- left_join(merge, evenness)
write_tsv(merge,"gut_microbial_alpha_diversity_rarefied.txt")

2.3 Visualization

library(readr)
library(dplyr)
library(reshape2)
library(tidyr)
library(car)
library(viridis)
library(RColorBrewer)
rm(list=ls())
# Read the alpha diversity table and reformat it
table <- read_tsv("gut_microbial_alpha_diversity_rarefied.txt")
table <- table %>%
    gather(alpha, value, 2:(ncol(table))) %>%
    filter(value > 0)
# Read metadata information and merge alpha diversity result with meta information
meta <- read_tsv("metadata_skin.tsv")
table <- left_join(table,meta)
# Factor the day variable in the merged table
table$Sampling_day <- factor(table$Sampling_day, levels = c("Day 1 day", "Day 1 night", "Day 2 day", "Day 2 night", 
                                                            "Day 3 day", "Day 3 night", "Day 4 day", "Day 4 night", 
                                                            "Day 5 day", "Day 5 night", "Day 6 day", "Day 6 night", 
                                                            "Day 7 day", "Day 7 night", "Day 8 day", "Day 8 night", 
                                                            "Day 9 day", "Day 9 night", "Day 10 day", "Day 10 night"))
# Plot the richness and save it in pdf file
box_plot <- ggplot(table, aes(Individual, value)) +
  geom_boxplot(outlier.size = 0.5, outlier.shape = NA) +
  theme_classic() +
  facet_wrap(~alpha, scales = "free") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylab("Value") +
  xlab("Individual")
ggsave("alpha_diversity_by_individuals.pdf")
box_plot

2.4 Statistical test

# Statistical tests
table$Sampling_day_numeric <- as.numeric(table$Sampling_day)
subtable <- table %>% filter(alpha == "richness")
# day as a fixed categorical variable
lm1 <- lm(subtable$value~Individual+Sample_type+Sampling_day, data=subtable) 
# perform type I anova
anova1 <- anova(lm1)
anova1

## Analysis of Variance Table
## 
## Response: subtable$value
##               Df  Sum Sq Mean Sq  F value    Pr(>F)    
## Individual     3 7529054 2509685 128.0997 < 2.2e-16 ***
## Sample_type    1   19252   19252   0.9826    0.3237    
## Sampling_day  19 1392252   73276   3.7402 5.942e-06 ***
## Residuals    110 2155081   19592                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# day as a fixed numeric variable
lm2 <- lm(subtable$value~Individual+Sample_type+Sampling_day_numeric, data=subtable) 
# perform type I anova
anova2 <- anova(lm2)
anova2

## Analysis of Variance Table
## 
## Response: subtable$value
##                       Df  Sum Sq Mean Sq F value Pr(>F)    
## Individual             3 7529054 2509685 90.5997 <2e-16 ***
## Sample_type            1   19252   19252  0.6950 0.4060    
## Sampling_day_numeric   1    1631    1631  0.0589 0.8086    
## Residuals            128 3545702   27701                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# day as a random numeric variable
lm3 <- lm(subtable$value~Individual+Sample_type+(1|Sampling_day_numeric), data=subtable) 
# perform type I anova
anova3 <- anova(lm3)
anova3

## Analysis of Variance Table
## 
## Response: subtable$value
##              Df  Sum Sq Mean Sq F value Pr(>F)    
## Individual    3 7529054 2509685 91.2655 <2e-16 ***
## Sample_type   1   19252   19252  0.7001 0.4043    
## Residuals   129 3547333   27499                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3 Beta-diversity analysis

3.1 Data processing

library(readr)
library(dplyr)
library(tidyr)
library(wilkoxmisc)
library(reshape2)
library(ggthemes)
library(vegan)
rm(list=ls())
# Read the previously saved result and cast it into dataframe format
tax <- read_tsv("kraken_species_skin_abundance.tidy.txt")
tax <- as.data.frame(tax)
tax_cast <- dcast(tax, Sample ~ Species, value.var = "count", fill = 0)
colnames(tax_cast) <- c("Sample", paste0("Species",1:(length(colnames(tax_cast))-1)))
# Save the reformat table
write_tsv(tax_cast, "kraken_species_skin_abundance_pcoa_rename.cast.txt")
# Replace " " with "_" before loading the table if necessary
tax_cast <- read.table("kraken_species_skin_abundance_pcoa_rename.cast.txt", header=TRUE, sep = "\t", row.names = 1, as.is=TRUE)
# Calculate the distance with bray curtis dissimilarity and jaccard based distance
distance_c <- vegdist(tax_cast, method="bray")
distance_j <- vegdist(tax_cast, method="jaccard")
UniFracMatrix_c <- as.matrix(distance_c)
UniFracMatrix_j <- as.matrix(distance_j)

3.2 Perform PERMANOVA and PCoA

# Read meta information
Meta <- read_tsv("metadata_skin.tsv")
# PERMANOVA
# Factor the day variable in the merged table
Meta$Sampling_day <- factor(Meta$Sampling_day, levels = c("Day 1 day", "Day 1 night", "Day 2 day", "Day 2 night", 
                                                          "Day 3 day", "Day 3 night", "Day 4 day", "Day 4 night", 
                                                          "Day 5 day", "Day 5 night", "Day 6 day", "Day 6 night", 
                                                          "Day 7 day", "Day 7 night", "Day 8 day", "Day 8 night", 
                                                          "Day 9 day", "Day 9 night", "Day 10 day", "Day 10 night"))
Meta$Sampling_day_numeric <- as.numeric(Meta$Sampling_day)
# day as a fixed numeric variable for bray curtis dissimilarity
adonisF_c <- adonis2(UniFracMatrix_c~Individual+Sample_type+Sampling_day_numeric,data=Meta)
adonisF_c

## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = UniFracMatrix_c ~ Individual + Sample_type + Sampling_day_numeric, data = Meta)
##                       Df SumOfSqs      R2       F Pr(>F)    
## Individual             3   0.4834 0.01603  0.8033  0.707    
## Sample_type            1   0.0746 0.00247  0.3718  0.947    
## Sampling_day_numeric   1   3.9210 0.13004 19.5484  0.001 ***
## Residual             128  25.6741 0.85146                   
## Total                133  30.1530 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# day as a random numeric variable for bray curtis dissimilarity
adonisR_c <- adonis2(UniFracMatrix_c~Individual+Sample_type+(1|Sampling_day_numeric),data=Meta)
adonisR_c

## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = UniFracMatrix_c ~ Individual + Sample_type + (1 | Sampling_day_numeric), data = Meta)
##              Df SumOfSqs      R2      F Pr(>F)
## Individual    3   0.4834 0.01603 0.7023  0.828
## Sample_type   1   0.0746 0.00247 0.3251  0.973
## Residual    129  29.5951 0.98150              
## Total       133  30.1530 1.00000

# day as a fixed numeric variable for jaccard based distance
adonisF_j <- adonis2(UniFracMatrix_j~Individual+Sample_type+Sampling_day_numeric,data=Meta)
adonisF_j

## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = UniFracMatrix_j ~ Individual + Sample_type + Sampling_day_numeric, data = Meta)
##                       Df SumOfSqs      R2       F Pr(>F)    
## Individual             3    0.747 0.01819  0.8817  0.637    
## Sample_type            1    0.158 0.00384  0.5580  0.911    
## Sampling_day_numeric   1    4.005 0.09753 14.1797  0.001 ***
## Residual             128   36.154 0.88043                   
## Total                133   41.063 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# day as a random numeric variable for jaccard based distance
adonisR_j <- adonis2(UniFracMatrix_j~Individual+Sample_type+(1|Sampling_day_numeric),data=Meta)
adonisR_j

## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = UniFracMatrix_j ~ Individual + Sample_type + (1 | Sampling_day_numeric), data = Meta)
##              Df SumOfSqs      R2      F Pr(>F)
## Individual    3    0.747 0.01819 0.7999  0.798
## Sample_type   1    0.158 0.00384 0.5063  0.949
## Residual    129   40.159 0.97797              
## Total       133   41.063 1.00000

# PCoA for bray curtis dissimilarity
PCoA_c <- cmdscale(UniFracMatrix_c, k = 2, eig = TRUE)
DF <- data.frame(Sample = row.names(PCoA_c$points), PCoA1 = PCoA_c$points[,1], 
                 PCoA2 = PCoA_c$points[,2], row.names = NULL)
Eigenvalues <- eigenvals(PCoA_c)
VarianceExplained <- Eigenvalues /sum(Eigenvalues)
VarianceExplained1_c <- 100 * signif(VarianceExplained[1], 2)
VarianceExplained2_c <- 100 * signif(VarianceExplained[2], 2)
PCoA_c <- merge(DF, Meta, by = "Sample", all.x = TRUE)
# PCoA for jaccard based distance
PCoA_j <- cmdscale(UniFracMatrix_j, k = 2, eig = TRUE)
DF <- data.frame(Sample = row.names(PCoA_j$points), PCoA1 = PCoA_j$points[,1], 
                 PCoA2 = PCoA_j$points[,2], row.names = NULL)
Eigenvalues <- eigenvals(PCoA_j)
VarianceExplained <- Eigenvalues /sum(Eigenvalues)
VarianceExplained1_j <- 100 * signif(VarianceExplained[1], 2)
VarianceExplained2_j <- 100 * signif(VarianceExplained[2], 2)
PCoA_j <- merge(DF, Meta, by = "Sample", all.x = TRUE)

3.3 Visualization

library(ggplot2)
# Plot the bray curtis dissimilarity PCoA result and save it in pdf file
xlab <- paste0("PCoA1 (Variance explained: ", VarianceExplained1_c, "%)")
ylab <- paste0("PCoA2 (Variance explained: ", VarianceExplained2_c, "%)")
custom_shapes <- c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
Plot_c <- ggplot(PCoA_c, aes(x = PCoA1, y = PCoA2, color = Individual)) +
  geom_point(size = 4, alpha = 0.8, aes(shape = Sampling_day)) +
  stat_ellipse(geom = "polygon", alpha = 0.05, aes(color = Individual)) +
  labs(title = "Bray-Curtis PCoA plot", x = xlab, y = ylab) +
  theme_bw() +
  theme(axis.title = element_text(size = 12),
        axis.text = element_text(size = 10)) +
  scale_shape_manual(values = custom_shapes)
ggsave("bray_curtis_pcoa_by_treatment_outcome_ellipse.pdf")
Plot_c

# Plot the binary Jaccard distance PCoA result and save it in pdf file
xlab <- paste0("PCoA1 (Variance explained: ", VarianceExplained1_j, "%)")
ylab <- paste0("PCoA2 (Variance explained: ", VarianceExplained2_j, "%)")
custom_shapes <- c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
Plot_j <- ggplot(PCoA_j, aes(x = PCoA1, y = PCoA2, color = Individual)) +
  geom_point(size = 4, alpha = 0.8, aes(shape = Sampling_day)) +
  stat_ellipse(geom = "polygon", alpha = 0.05, aes(color = Individual)) +
  labs(title = "Binary-Jaccard PCoA plot", x = xlab, y = ylab) +
  theme_bw() +
  theme(axis.title = element_text(size = 12),
        axis.text = element_text(size = 10)) +
  scale_shape_manual(values = custom_shapes)
ggsave("binary_jaccard_pcoa_by_treatment_outcome_ellipse.pdf")
Plot_j

4 Microbial transmission prediction

4.1 Data processing

# Read the metadata table
rm(list=ls())
genome_annotation_metadata <- read_tsv("genome_annotation_metadata.tsv")
genome_annotation_metadata <- genome_annotation_metadata %>%
  mutate(Sampling_day = case_when(
    grepl("Day \\d day", Sampling_day) ~ str_replace(Sampling_day, "Day (\\d) day", "Day 0\\1 day"),
    grepl("Day \\d night", Sampling_day) ~ str_replace(Sampling_day, "Day (\\d) night", "Day 0\\1 night"),
    TRUE ~ Sampling_day
  ))
# Calculate the average of the Completeness and Contamination for each Sampling_day and Sample_type combination
average_data <- genome_annotation_metadata %>%
  group_by(Sampling_day, Sample_type) %>%
  summarise(Average_Completeness = mean(Completeness, na.rm = TRUE),
            Average_Contamination = mean(Contamination, na.rm = TRUE))
top_n <- head(genome_annotation_metadata, n = 5)
styled_table <- kable(top_n, format = "html") %>%
  kable_styling(bootstrap_options = "bordered", full_width = T, font_size = 10, 
                position = "center")
styled_table

Genome	Completeness	Contamination	Taxonomy	Sample	Type	Sample_type	Surface_material	Sampling place specific details	Location	Sampling_day
SL336613_bin.3	53.83955	0.990991	s__Corynebacterium dentalis	SL336613	Indoor surface	Door knob	Metal	residence 1 door knob day 5 night	Residence 1	Day 05 night
SL336614_bin.3	97.18468	1.951952	s__Corynebacterium dentalis	SL336614	Skin	Left palm	Skin	residence 1 left palm day 6 night	Residence 1	Day 06 night
SL336616_bin.2	54.66216	1.306306	s__Corynebacterium dentalis	SL336616	Indoor surface	Door knob	Metal	residence 1 door knob day 8 night	Residence 1	Day 08 night
SL336620_bin.3	96.05856	1.388889	s__Corynebacterium dentalis	SL336620	Skin	Left palm	Skin	residence 1 left palm day 5 night	Residence 1	Day 05 night
SL336622_bin.7	64.88631	1.190476	s__Corynebacterium dentalis	SL336622	Skin	Left palm	Skin	residence 1 left palm day 7 night	Residence 1	Day 07 night

4.2 Visualization

library(ggplot2)
library(dplyr)
library(readr)
library(kableExtra)
library(stringr)
# Generate bubble plot
bubble_chart <- ggplot(average_data, aes(x=Sampling_day, y=Sample_type, 
                                         size=Average_Completeness, color=Average_Contamination)) +
  geom_point(alpha=1) +
  scale_size_continuous(name = "Completeness", range = c(0, 12)) +
  scale_color_gradient(name = "Contamination", low = "blue", high = "red") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "right",
        legend.title = element_text(size = 10, face = "bold"),
        legend.text = element_text(size = 8)) +
  labs(title = "Microbial Transmission Prediction", x = "Sampling Day", y = "Sample Type")
# Save the bubble plot in pdf file
ggsave("genome_annotation_metadata_bubble_plot.pdf", plot = bubble_chart, dpi = 300, width = 10, height = 5)
bubble_chart

# Generate line plot (Contamination ~ Sampling Day)
contamination_line_chart <- ggplot(average_data, aes(x=Sampling_day, y=Average_Contamination, color=Sample_type, group=Sample_type)) +
  geom_line() +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, linetype = 2) +
  scale_color_viridis_d() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "right",
        legend.title = element_text(size = 10, face = "bold"),
        legend.text = element_text(size = 8)) +
  labs(title = "Contamination Over Time by Sample Type",
       x = "Sampling Day",
       y = "Average Contamination",
       color = "Sample Type")
# Generate line plot (Completeness ~ Sampling Day)
completeness_line_chart <- ggplot(average_data, aes(x=Sampling_day, y=Average_Completeness, color=Sample_type, group=Sample_type)) +
  geom_line() +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, linetype = 2) +
  scale_color_viridis_d() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "right",
        legend.title = element_text(size = 10, face = "bold"),
        legend.text = element_text(size = 8)) +
  labs(title = "Completeness Over Time by Sample Type",
       x = "Sampling Day",
       y = "Average Completeness",
       color = "Sample Type")
# Save line plots in pdf file
ggsave("contamination_over_time_line_chart.pdf", plot = contamination_line_chart, dpi = 300, width = 10, height = 5)
ggsave("completeness_over_time_line_chart.pdf", plot = completeness_line_chart, dpi = 300, width = 10, height = 5)
contamination_line_chart

completeness_line_chart

SessionInfo

## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=Chinese (Simplified)_China.utf8 
## [2] LC_CTYPE=Chinese (Simplified)_China.utf8   
## [3] LC_MONETARY=Chinese (Simplified)_China.utf8
## [4] LC_NUMERIC=C                               
## [5] LC_TIME=Chinese (Simplified)_China.utf8    
## 
## time zone: Etc/GMT-8
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggthemes_5.1.0      RColorBrewer_1.1-3  viridis_0.6.5      
##  [4] viridisLite_0.4.2   car_3.1-2           carData_3.0-5      
##  [7] OTUtable_1.1.2      fossil_0.4.0        shapefiles_0.7.2   
## [10] foreign_0.8-86      maps_3.4.2          sp_2.1-4           
## [13] vegan_2.6-6         lattice_0.22-6      GUniFrac_1.8       
## [16] kableExtra_1.4.0    indicspecies_1.7.14 permute_0.9-7      
## [19] reshape2_1.4.4      wilkoxmisc_0.4      stringr_1.5.1      
## [22] dplyr_1.1.4         readr_2.1.5         tidyr_1.3.1        
## [25] knitr_1.46          ggplot2_3.5.1       devtools_2.4.5     
## [28] usethis_2.2.3      
## 
## loaded via a namespace (and not attached):
##  [1] gridExtra_2.3       remotes_2.5.0       inline_0.3.19      
##  [4] rlang_1.1.3         magrittr_2.0.3      clue_0.3-65        
##  [7] matrixStats_1.3.0   compiler_4.4.0      mgcv_1.9-1         
## [10] systemfonts_1.1.0   vctrs_0.6.5         rmutil_1.1.10      
## [13] profvis_0.3.8       pkgconfig_2.0.3     crayon_1.5.2       
## [16] fastmap_1.1.1       ellipsis_0.3.2      labeling_0.4.3     
## [19] utf8_1.2.4          modeest_2.4.0       promises_1.3.0     
## [22] rmarkdown_2.26      sessioninfo_1.2.2   tzdb_0.4.0         
## [25] ragg_1.3.2          purrr_1.0.2         bit_4.0.5          
## [28] xfun_0.43           cachem_1.0.8        jsonlite_1.8.8     
## [31] highr_0.10          later_1.3.2         parallel_4.4.0     
## [34] cluster_2.1.6       R6_2.5.1            bslib_0.7.0        
## [37] stringi_1.8.4       rpart_4.1.23        pkgload_1.3.4      
## [40] jquerylib_0.1.4     Rcpp_1.0.12         iterators_1.0.14   
## [43] httpuv_1.6.15       Matrix_1.7-0        splines_4.4.0      
## [46] tidyselect_1.2.1    abind_1.4-5         rstudioapi_0.16.0  
## [49] yaml_2.3.8          timeDate_4032.109   codetools_0.2-20   
## [52] miniUI_0.1.1.1      pkgbuild_1.4.4      tibble_3.2.1       
## [55] plyr_1.8.9          shiny_1.8.1.1       withr_3.0.0        
## [58] evaluate_0.23       stable_1.1.6        urlchecker_1.0.1   
## [61] xml2_1.3.6          pillar_1.9.0        foreach_1.5.2      
## [64] generics_0.1.3      vroom_1.6.5         hms_1.1.3          
## [67] munsell_0.5.1       scales_1.3.0        timeSeries_4032.109
## [70] xtable_1.8-4        glue_1.7.0          statip_0.2.3       
## [73] tools_4.4.0         spatial_7.3-17      fBasics_4032.96    
## [76] fs_1.6.4            grid_4.4.0          ape_5.8            
## [79] colorspace_2.1-0    nlme_3.1-164        cli_3.6.2          
## [82] textshaping_0.3.7   fansi_1.0.6         svglite_2.1.3      
## [85] gtable_0.3.5        stabledist_0.7-1    sass_0.4.9         
## [88] digest_0.6.35       ggrepel_0.9.5       htmlwidgets_1.6.4  
## [91] farver_2.1.2        memoise_2.0.1       htmltools_0.5.8.1  
## [94] lifecycle_1.0.4     statmod_1.5.0       mime_0.12          
## [97] bit64_4.0.5         MASS_7.3-60.2

BIO316 Coursework3

High-Throughput Approaches and Systems Biology

Student id: 1931391

2024-05-30

1 Taxonomic analysis

1.1 Open taxonomy count matrix table

1.2 Prepare the table for plotting

1.3 Visualization

1.4 Identify 3 species that could strongly indicate each individuals

2 Alpha-diversity analysis

2.1 Convert cast table into tidy

2.2 Rarefaction analysis

2.3 Visualization

2.4 Statistical test

3 Beta-diversity analysis

3.1 Data processing

3.2 Perform PERMANOVA and PCoA

3.3 Visualization

4 Microbial transmission prediction

4.1 Data processing

4.2 Visualization

SessionInfo